(Artificial) Neural Networks (ANN)
Table of Contents
Perceptron
XOR Problem
| $x_1$ | $x_2$ | $x_1$ XOR $x_2$ |
|---|---|---|
| 0 | 0 | 0 |
| 0 | 1 | 1 |
| 1 | 0 | 1 |
| 1 | 1 | 0 |
Neurons compute the weighted sum of their inputs
A neuron is activated or fired when the sum $a$ is positive
$$
\begin{align*}
a &= \omega_0 + \omega_1 x_1 + \omega_2 x_2 \\ \\
\hat{y} &= g(a) =
\begin{cases}
1 & a > 0\\
0 & \text{otherwise}
\end{cases}
\end{align*}
$$
Differentiable activation function
In a compact representation
Multi-layer perceptron
We can represent this "neuron" as follows:
The main weakness of linear predictors is their lack of capacity. For classification, the populations have to be linearly separable.
The XOR example can be solved by pre-processing the data to make the two populations linearly separable.
Universal function approximator Universal function classifier
Parameterized
Example: Linear Classifier
Example: Neural Networks
colah's blog
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
%matplotlib inline
# Training data generation: two linearly separable classes split by
# the line g(x) = 0.8*x1 + x2 - 3 = 0.
m = 1000
x1 = 8*np.random.rand(m, 1)
x2 = 7*np.random.rand(m, 1) - 4
g = 0.8*x1 + x2 - 3
C1 = np.where(g >= 0)[0]   # indices of class 1 (on/above the line)
C0 = np.where(g < 0)[0]    # indices of class 0 (below the line)
N = C1.shape[0]
M = C0.shape[0]
m = N + M
# A column of ones is prepended so the bias is absorbed into the weight vector.
X1 = np.hstack([np.ones([N,1]), x1[C1], x2[C1]])
X0 = np.hstack([np.ones([M,1]), x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_y = np.vstack([np.ones([N,1]), -np.ones([M,1])])  # +1/-1 labels (re-made as 1/0 below)
train_X = np.asmatrix(train_X)
train_y = np.asmatrix(train_y)
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
# NOTE: the labels are rebuilt as 1/0 here, which is what the sigmoid
# cross-entropy loss below expects; the +1/-1 labels above are unused.
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
train_y = np.asmatrix(train_y)
# Logistic regression as a single neuron, in TensorFlow 1.x.
import tensorflow as tf

LR = 0.05        # learning rate
n_iter = 10000   # number of gradient-descent iterations

x = tf.placeholder(tf.float32, [None, 3])   # inputs: [1, x1, x2] per row
y = tf.placeholder(tf.float32, [None, 1])   # labels: 1 or 0
w = tf.Variable(tf.random_normal([3,1]))    # weights (bias absorbed via the ones column)

# Compute y_pred: the linear score (logits); the sigmoid is applied
# inside the loss below, so no activation is needed here.
y_pred = tf.matmul(x, w)

loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = y_pred, labels = y)
loss = tf.reduce_mean(loss)
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)
init = tf.global_variables_initializer()

# Feed the training data into x and y and iterate gradient descent.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_iter):
        sess.run(optm, feed_dict = {x: train_X, y: train_y})
    w_hat = sess.run(w)

# Decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = -(w1/w2)*x1 - w0/w2.
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[1,0]/w_hat[2,0]*x1p - w_hat[0,0]/w_hat[2,0]

plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# Same classifier, rewritten with a weights dictionary and a build_model
# function — the structure used for deeper networks later.

# define input and output size
n_input = 3    # [1, x1, x2] — bias still absorbed via the ones column
n_output = 1

# define weights as a dictionary
weights = {
    'output' : tf.Variable(tf.random_normal([n_input, n_output], stddev = 0.1))
}

# define placeholders for train_X and train_y (shapes match the data above)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])

# define network architecture
def build_model(x, weights):
    """Single linear layer: return the logits x*W."""
    output = tf.matmul(x, weights['output'])
    return output

# define loss: sigmoid cross-entropy on the logits
pred = build_model(x, weights)
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)

LR = 0.05
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)

n_batch = 50     # Batch size (unused here — full-batch training below)
n_iter = 10000   # Learning iteration
n_prt = 250      # Print cycle

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# training or learning: full-batch gradient descent, recording the loss
# every n_prt iterations
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))

w_hat = sess.run(weights['output'])

plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()

# Decision boundary in the (x1, x2) plane.
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[1,0]/w_hat[2,0]*x1p - w_hat[0,0]/w_hat[2,0]

plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
Weights and Bias
# Same classifier again, now with an explicit bias variable instead of a
# ones column in the data.
n_input = 2
n_output = 1

train_X = train_X[:,1:3]   # drop the ones column; the bias is learned separately

# define network
def build_model(x, weights, biases):
    """Single linear layer with bias: return the logits x*W + b."""
    output = tf.add(tf.matmul(x, weights['output']), biases['output'])
    return output

weights = {
    'output' : tf.Variable(tf.random_normal([n_input, n_output], stddev = 0.1))
}
biases = {
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}

x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])

# Compare the predictions (logits) against the labels.
pred = build_model(x, weights, biases)
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=y)
loss = tf.reduce_mean(loss)

LR = 0.05
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

n_batch = 50
n_iter = 15000
n_prt = 250

loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))

w_hat = sess.run(weights['output'])
b_hat = sess.run(biases['output'])

plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()

# Decision boundary: w1*x1 + w2*x2 + b = 0.
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]

plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
One-hot Encoding
$$y^{(i)} \in \{1,0\} \quad \implies \quad y^{(i)} \in \{[0,1],[1,0]\}$$
tf.nn.sigmoid_cross_entropy_with_logits $\rightarrow$ tf.nn.softmax_cross_entropy_with_logits
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the 0/1 labels: 0 -> [1, 0], 1 -> [0, 1].
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()
print(train_y)

# The number of output nodes changes: one node per class.
n_input = 2
n_output = 2

weights = {
    'output' : tf.Variable(tf.random_normal([n_input, n_output], stddev = 0.1))
}
biases = {
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}

x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])

# Softmax cross-entropy replaces the sigmoid loss for one-hot targets.
pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
loss = tf.reduce_mean(loss)

LR = 0.05
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

n_batch = 50
n_iter = 10000
n_prt = 250

loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))

w_hat = sess.run(weights['output'])
b_hat = sess.run(biases['output'])

plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()

print(w_hat)

# Each output node defines its own linear boundary; plot both.
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]

plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# Training data generation: two classes separated by a *nonlinear*
# (quadratic) boundary g(x) = -0.5*(x1-1)^2 + 2*x2 + 5 = 0.
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1-1)**2 + 2*x2 + 5
C1 = np.where(g >= 0)[0]   # class 1: on/above the parabola
C0 = np.where(g < 0)[0]    # class 0: below the parabola
N = C1.shape[0]
M = C0.shape[0]
m = N + M
# No ones column here: biases are separate variables in the network.
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()   # one-hot labels
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# Number of nodes in each layer: 2 inputs -> 2 hidden -> 2 outputs.
# (2 hidden units so the hidden representation can be plotted in 2-D below.)
n_input = 2
n_hidden = 2
n_output = 2

# Shapes of the weight matrices (the arrows between layers).
weights = {
    'hidden' : tf.Variable(tf.random_normal([n_input, n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden, n_output], stddev = 0.1))
}
biases = {
    'hidden' : tf.Variable(tf.random_normal([n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}

x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])

def build_model(x, weights, biases):
    """MLP with one sigmoid hidden layer; returns the output-layer logits."""
    hidden = tf.add(tf.matmul(x, weights['hidden']), biases['hidden'])
    hidden = tf.nn.sigmoid(hidden)
    output = tf.add(tf.matmul(hidden, weights['output']), biases['output'])
    return output

# Compare the predictions against the one-hot labels.
pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)

LR = 0.01
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

n_batch = 50
n_iter = 30000
n_prt = 250

# Feed the training data (full batch) and record the loss periodically.
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))

w_hat = sess.run(weights)
b_hat = sess.run(biases)

plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
# Map the training data through the learned hidden layer:
# H = sigmoid(X*W_hidden + b_hidden) is the 2-D hidden representation,
# in which the two classes become linearly separable.
H = train_X*w_hat['hidden'] + b_hat['hidden']
H = 1/(1 + np.exp(-H))   # sigmoid, matching the network's hidden activation
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# Output-layer boundaries drawn in the hidden (z1, z2) space.
x1p = np.arange(0, 1, 0.01).reshape(-1, 1)
x2p = - w_hat['output'][0,0]/w_hat['output'][1,0]*x1p - b_hat['output'][0]/w_hat['output'][1,0]
x3p = - w_hat['output'][0,1]/w_hat['output'][1,1]*x1p - b_hat['output'][1]/w_hat['output'][1,1]
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# Hidden-layer boundaries drawn back in the original (x1, x2) space.
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat['hidden'][0,0]/w_hat['hidden'][1,0]*x1p - b_hat['hidden'][0]/w_hat['hidden'][1,0]
x3p = - w_hat['hidden'][0,1]/w_hat['hidden'][1,1]*x1p - b_hat['hidden'][1]/w_hat['hidden'][1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# Training data generation: a more complicated nonlinear boundary
# g(x) = -0.5*(x1*x2 - 1)^2 + 2*x2 + 5 = 0 (needs more hidden units).
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1*x2-1)**2 + 2*x2 + 5
C1 = np.where(g >= 0)[0]   # class 1: g >= 0
C0 = np.where(g < 0)[0]    # class 0: g < 0
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()   # one-hot labels
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# Layer sizes: 2 inputs -> 4 hidden -> 2 outputs.
# (4 hidden units: the boundary-plot below draws four hidden-unit lines.)
n_input = 2
n_hidden = 4
n_output = 2

def build_model(x, weights, biases):
    """MLP with one sigmoid hidden layer; returns the output-layer logits."""
    hidden = tf.add(tf.matmul(x, weights['hidden']), biases['hidden'])
    hidden = tf.nn.sigmoid(hidden)
    output = tf.add(tf.matmul(hidden, weights['output']), biases['output'])
    return output

weights = {
    'hidden' : tf.Variable(tf.random_normal([n_input, n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden, n_output], stddev = 0.1))
}
biases = {
    'hidden' : tf.Variable(tf.random_normal([n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}

x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])

pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)

LR = 0.01
optm = tf.train.GradientDescentOptimizer(LR).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

n_batch = 50
n_iter = 40000
n_prt = 250

# Training cycle: full-batch descent, recording the loss every n_prt steps.
loss_record = []
for epoch in range(n_iter):
    sess.run(optm, feed_dict = {x: train_X, y: train_y})
    if epoch % n_prt == 0:
        loss_record.append(sess.run(loss, feed_dict = {x: train_X, y: train_y}))

w_hat = sess.run(weights)
b_hat = sess.run(biases)

# plots
plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record)
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.show()
# Plot the four learned hidden-unit boundaries over the input space.
# Each hidden unit k defines the line w[0,k]*x1 + w[1,k]*x2 + b[k] = 0.
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat['hidden'][0,0]/w_hat['hidden'][1,0]*x1p - b_hat['hidden'][0]/w_hat['hidden'][1,0]
x3p = - w_hat['hidden'][0,1]/w_hat['hidden'][1,1]*x1p - b_hat['hidden'][1]/w_hat['hidden'][1,1]
x4p = - w_hat['hidden'][0,2]/w_hat['hidden'][1,2]*x1p - b_hat['hidden'][2]/w_hat['hidden'][1,2]
x5p = - w_hat['hidden'][0,3]/w_hat['hidden'][1,3]*x1p - b_hat['hidden'][3]/w_hat['hidden'][1,3]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.plot(x1p, x4p, 'm', linewidth = 3, label = '')
plt.plot(x1p, x5p, 'c', linewidth = 3, label = '')
# (the original called plt.xlabel('$x_1$') twice; the duplicate is removed)
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
One of the central ideas of computer science
Depends on solutions to smaller instances of the same problem ( = subproblem)
Recursion: a function that calls itself on smaller instances of the same problem.
$$n ! = n \cdot (n-1) \cdots 2 \cdot 1$$
# Iterative factorial: accumulate the running product 1 * 2 * ... * n.
n = 5
m = 1
for k in range(1, n + 1):
    m *= k
print(m)
def fac(n):
    """Return n! computed recursively.

    The base case covers n <= 1 (so fac(0) == 1, matching 0! = 1);
    the original base case `n == 1` recursed forever on fac(0).
    """
    if n <= 1:
        return 1
    return n*fac(n-1)

# recursive
fac(5)
Dynamic Programming: general, powerful algorithm design technique
Fibonacci numbers:
# naive Fibonacci
# fib(n) recomputes the same subproblems over and over: exponential time.
def fib(n):
    """Return the n-th Fibonacci number (1-indexed: fib(1) == fib(2) == 1)."""
    if n <= 2:
        return 1
    return fib(n-1) + fib(n-2)

fib(10)
# Memoized (top-down dynamic programming) Fibonacci. Results are cached
# in the global array `memo`: memo[k-1] holds fib(k), and 0 means
# "not computed yet", so each subproblem is solved exactly once.
def mfib(n):
    global memo
    if memo[n-1] != 0:
        # cache hit: reuse the stored answer
        return memo[n-1]
    if n <= 2:
        # base cases: fib(1) = fib(2) = 1
        memo[n-1] = 1
    else:
        # solve the two subproblems, then cache the sum
        memo[n-1] = mfib(n-1) + mfib(n-2)
    return memo[n-1]
import numpy as np
n = 10
# cache for mfib: memo[k-1] stores fib(k); 0 = not yet computed
memo = np.zeros(n)
mfib(n)
n = 30
# naive recursion: exponential time
%timeit fib(30)
# reset the cache, then time the memoized (linear-time) version
memo = np.zeros(n)
%timeit mfib(30)
$=$ Learning or estimating weights and biases of multi-layer perceptron from training data
3 key components
In mathematical expression
$$\begin{align*}
\min_{\omega} \quad &f(\omega)
\end{align*}
$$
$$ \min_{\omega} \sum_{i=1}^{m}\ell\left( h_{\omega}\left(x^{(i)}\right),y^{(i)}\right)$$
Learning weights and biases from data using gradient descent
Backpropagation
Chain Rule
Computing the derivative of the composition of functions
$\space f(g(x))' = f'(g(x))g'(x)$
$\space {dz \over dx} = {dz \over dy} \bullet {dy \over dx}$
$\space {dz \over dw} = ({dz \over dy} \bullet {dy \over dx}) \bullet {dx \over dw}$
$\space {dz \over du} = ({dz \over dy} \bullet {dy \over dx} \bullet {dx \over dw}) \bullet {dw \over du}$
Backpropagation
Update weights recursively with memory
Optimization procedure
Summary
From Wikipedia
More here
We will be using MNIST to create a Multinomial Classifier that can detect if the MNIST image shown is a member of class 0,1,2,3,4,5,6,7,8 or 9. Succinctly, we're teaching a computer to recognize handwritten digits.
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Import Library
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
Let's download and load the dataset.
# Download (if needed) and load MNIST with one-hot encoded labels.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
print ("The training data set is:\n")
print (mnist.train.images.shape)
print (mnist.train.labels.shape)
print ("The test data set is:")
print (mnist.test.images.shape)
print (mnist.test.labels.shape)
Display a few random samples from it:
mnist.train.images[5]
# well, that's not a picture (or image), it's an array.
mnist.train.images[5].shape   # (784,) — a flattened 28x28 image
You might think the training set is made up of 28 $\times$28 grayscale images of handwritten digits. No !!!
The thing is, the image has been flattened. These are 28x28 images that have been flattened into a 1D array. Let's reshape one.
# Reshape the flat 784-vector back into a 28x28 image (two equivalent ways).
img = np.reshape(mnist.train.images[5], [28,28])
img = mnist.train.images[5].reshape([28,28])
# So now we have a 28x28 matrix, where each element is an intensity level from 0 to 1.
img.shape
Let's visualize what some of these images and their corresponding training labels look like.
# Display the image, then its one-hot label and the decoded class index.
plt.figure(figsize = (6,6))
plt.imshow(img, 'gray')
plt.xticks([])
plt.yticks([])
plt.show()
mnist.train.labels[5]              # one-hot label vector
np.argmax(mnist.train.labels[5])   # class index (0-9)
Batch maker embedded
# next_batch returns a random mini-batch of (images, labels).
x, y = mnist.train.next_batch(3)
print(x.shape)
print(y.shape)
# Import Library
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Draw one random sample and display it with its decoded label.
train_x, train_y = mnist.train.next_batch(1)
img = train_x[0,:].reshape(28,28)
plt.figure(figsize=(6,6))
plt.imshow(img,'gray')
plt.title("Label : {}".format(np.argmax(train_y[0,:])))
plt.xticks([])
plt.yticks([])
plt.show()
One hot encoding
print ('Train labels : {}'.format(train_y[0, :]))
# Layer sizes: 784 input pixels -> hidden layer -> 10 digit classes.
# (100 hidden units is a conventional choice for this exercise; any
# moderate width works.)
n_input = 28*28
n_hidden = 100
n_output = 10

weights = {
    'hidden' : tf.Variable(tf.random_normal([n_input, n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden, n_output], stddev = 0.1))
}
biases = {
    'hidden' : tf.Variable(tf.random_normal([n_hidden], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1))
}

x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
First, the layer performs several matrix multiplication to produce a set of linear activations
Second, each linear activation is running through a nonlinear activation function
Third, predict values with an affine transformation
# Define Network
def build_model(x, weights, biases):
    """Fully connected net: affine -> ReLU -> affine (returns class logits)."""
    # first hidden layer: affine transformation of the inputs
    pre_act = tf.matmul(x, weights['hidden']) + biases['hidden']
    # non-linear activation function
    act = tf.nn.relu(pre_act)
    # output layer: affine map producing the logits
    return tf.matmul(act, weights['output']) + biases['output']
Loss
Optimizer
# Define Loss: softmax cross-entropy on the raw logits from the network.
pred = build_model(x, weights, biases)
loss = tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels = y)
loss = tf.reduce_mean(loss)
LR = 0.0001
optm = tf.train.AdamOptimizer(LR).minimize(loss)   # Adam instead of plain SGD
- n_batch: batch size for mini-batch gradient descent
- n_iter: the number of iteration steps
- n_prt: check loss every n_prt iterations

Initializer
n_batch = 50     # Batch Size
n_iter = 3000    # Learning Iteration
n_prt = 250      # Print Cycle

# Open a session, define the variable initializer, and run it.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Mini-batch training; every n_prt steps record the loss on the current
# training batch (c1) and on a fresh test batch (c2).
loss_record_train = []
loss_record_test = []
for epoch in range(n_iter):
    train_x, train_y = mnist.train.next_batch(n_batch)
    sess.run(optm, feed_dict = {x: train_x, y: train_y})
    if epoch % n_prt == 0:
        test_x, test_y = mnist.test.next_batch(n_batch)
        c1 = sess.run(loss, feed_dict = {x: train_x, y: train_y})
        c2 = sess.run(loss, feed_dict = {x: test_x, y: test_y})
        loss_record_train.append(c1)
        loss_record_test.append(c2)
        print ("Iter : {}".format(epoch))
        print ("Cost : {}".format(c1))

plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record_train))*n_prt,
         loss_record_train, label = 'training')
plt.plot(np.arange(len(loss_record_test))*n_prt,
         loss_record_test, label = 'testing')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.legend(fontsize = 12)
plt.ylim([0, np.max(loss_record_train)])
plt.show()
# Accuracy on a held-out batch of 100 test images.
test_x, test_y = mnist.test.next_batch(100)
my_pred = sess.run(pred, feed_dict = {x : test_x})
my_pred = np.argmax(my_pred, axis = 1)   # predicted class per image
labels = np.argmax(test_y, axis = 1)     # true class per image
accr = np.mean(np.equal(my_pred, labels))
print("Accuracy : {}%".format(accr*100))
# Single-image prediction with the full class-probability vector.
test_x, test_y = mnist.test.next_batch(1)
logits = sess.run(tf.nn.softmax(pred), feed_dict = {x : test_x})
predict = np.argmax(logits)
plt.figure(figsize = (6,6))
plt.imshow(test_x.reshape(28,28), 'gray')
plt.xticks([])
plt.yticks([])
plt.show()
print('Prediction : {}'.format(predict))
np.set_printoptions(precision = 2, suppress = True)
print('Probability : {}'.format(logits.ravel()))
You may observe that the accuracy on the test dataset is a little lower than the accuracy on the training dataset. This gap between training accuracy and test accuracy is an example of overfitting, when a machine learning model performs worse on new data than on its training data.
What is the highest accuracy you can achieve with this first fully connected model? Since the handwritten digit classification task is pretty straightforward, you may be wondering how we can do better...
$\Rightarrow$ As we saw in lecture, convolutional neural networks (CNNs) are particularly well-suited for a variety of tasks in computer vision, and have achieved near-perfect accuracies on the MNIST dataset. We will build a CNN and ultimately output a probability distribution over the 10 digit classes (0-9) in the next lectures.
Definition
Dimension Reduction
It is like 'deep learning version' of unsupervised learning.
Definition
Encoder and Decoder
$$ \mathbb{E} \left[ \lVert X - g \circ f(X) \rVert^2 \right] \approx 0$$
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Keep only the digits 1, 5 and 6 for the autoencoder experiment.
train_idx = ((np.argmax(mnist.train.labels, 1) == 1) | \
(np.argmax(mnist.train.labels, 1) == 5) | \
(np.argmax(mnist.train.labels, 1) == 6))
test_idx = ((np.argmax(mnist.test.labels, 1) == 1) | \
(np.argmax(mnist.test.labels, 1) == 5) | \
(np.argmax(mnist.test.labels, 1) == 6))
train_imgs = mnist.train.images[train_idx]
train_labels = mnist.train.labels[train_idx]
test_imgs = mnist.test.images[test_idx]
test_labels = mnist.test.labels[test_idx]
n_train = train_imgs.shape[0]
n_test = test_imgs.shape[0]
print ("The number of training images : {}, shape : {}".format(n_train, train_imgs.shape))
print ("The number of testing images : {}, shape : {}".format(n_test, test_imgs.shape))
# Shape of input and latent variable
n_input = 28*28   # flattened MNIST image

# Encoder structure: 784 -> 500 -> 300 -> 2
# (intermediate widths are a conventional choice; the latent dimension
# must be 2 so the latent space can be plotted directly below)
n_encoder1 = 500
n_encoder2 = 300
n_latent = 2

# Decoder structure (mirror of the encoder): 2 -> 300 -> 500 -> 784
n_decoder2 = 300
n_decoder1 = 500

weights = {
    'encoder1' : tf.Variable(tf.random_normal([n_input, n_encoder1], stddev = 0.1)),
    'encoder2' : tf.Variable(tf.random_normal([n_encoder1, n_encoder2], stddev = 0.1)),
    'latent' : tf.Variable(tf.random_normal([n_encoder2, n_latent], stddev = 0.1)),
    'decoder2' : tf.Variable(tf.random_normal([n_latent, n_decoder2], stddev = 0.1)),
    'decoder1' : tf.Variable(tf.random_normal([n_decoder2, n_decoder1], stddev = 0.1)),
    'reconst' : tf.Variable(tf.random_normal([n_decoder1, n_input], stddev = 0.1))
}
# Each bias vector matches the width of its layer's output.
biases = {
    'encoder1' : tf.Variable(tf.random_normal([n_encoder1], stddev = 0.1)),
    'encoder2' : tf.Variable(tf.random_normal([n_encoder2], stddev = 0.1)),
    'latent' : tf.Variable(tf.random_normal([n_latent], stddev = 0.1)),
    'decoder2' : tf.Variable(tf.random_normal([n_decoder2], stddev = 0.1)),
    'decoder1' : tf.Variable(tf.random_normal([n_decoder1], stddev = 0.1)),
    'reconst' : tf.Variable(tf.random_normal([n_input], stddev = 0.1))
}

x = tf.placeholder(tf.float32, [None, n_input])
Encoder
tanh for a nonlinear activation functionlatent is not applied with a nonlinear activation functionDecoder
tanh for a nonlinear activation functionreconst is not applied with a nonlinear activation function
def encoder(x, weights, biases):
    """Map inputs to the latent space: two tanh layers, then a linear layer."""
    h1 = tf.nn.tanh(tf.matmul(x, weights['encoder1']) + biases['encoder1'])
    h2 = tf.nn.tanh(tf.matmul(h1, weights['encoder2']) + biases['encoder2'])
    # the latent layer is purely linear (no activation)
    return tf.matmul(h2, weights['latent']) + biases['latent']
def decoder(latent, weights, biases):
    """Map latent points back to image space: two tanh layers, then a linear layer."""
    h2 = tf.nn.tanh(tf.matmul(latent, weights['decoder2']) + biases['decoder2'])
    h1 = tf.nn.tanh(tf.matmul(h2, weights['decoder1']) + biases['decoder1'])
    # the reconstruction layer is purely linear (no activation)
    return tf.matmul(h1, weights['reconst']) + biases['reconst']
Loss
Optimizer
LR = 0.0001
# Autoencoder graph: encode to the 2-D latent space, then decode back.
latent = encoder(x, weights, biases)
reconst = decoder(latent, weights, biases)
# Reconstruction loss: mean squared error between input and reconstruction.
loss = tf.square(tf.subtract(x, reconst))
loss = tf.reduce_mean(loss)
optm = tf.train.AdamOptimizer(LR).minimize(loss)
- n_batch: batch size for mini-batch gradient descent
- n_iter: the number of iteration steps
- n_prt: check loss every n_prt iterations

n_batch = 50
n_iter = 2500
n_prt = 250
def train_batch_maker(batch_size):
    """Sample a random mini-batch (images, labels) from the training set."""
    idx = np.random.randint(n_train, size = batch_size)
    return train_imgs[idx], train_labels[idx]

def test_batch_maker(batch_size):
    """Sample a random mini-batch (images, labels) from the test set."""
    idx = np.random.randint(n_test, size = batch_size)
    return test_imgs[idx], test_labels[idx]
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Training loop (indentation restored — the control flow had been
# flattened in the export): record train/test reconstruction loss
# every n_prt steps. Labels are discarded: training is unsupervised.
loss_record_train = []
loss_record_test = []
for epoch in range(n_iter):
    train_x, _ = train_batch_maker(n_batch)
    sess.run(optm, feed_dict = {x : train_x})
    if epoch % n_prt == 0:
        test_x, _ = test_batch_maker(n_batch)
        c1 = sess.run(loss, feed_dict = {x: train_x})
        c2 = sess.run(loss, feed_dict = {x: test_x})
        loss_record_train.append(c1)
        loss_record_test.append(c2)
        print ("Iter : {}".format(epoch))
        print ("Cost : {}".format(c1))

plt.figure(figsize=(10,8))
plt.plot(np.arange(len(loss_record_train))*n_prt, loss_record_train, label = 'training')
plt.plot(np.arange(len(loss_record_test))*n_prt, loss_record_test, label = 'testing')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.legend(fontsize = 12)
plt.ylim([0,np.max(loss_record_train)])
plt.show()
# Reconstruct one random test image: input vs. reconstruction side by side.
test_x, _ = test_batch_maker(1)
x_reconst = sess.run(reconst, feed_dict = {x: test_x})
plt.figure(figsize = (10,8))
plt.subplot(1,2,1)
plt.imshow(test_x.reshape(28,28), 'gray')
plt.title('Input Image', fontsize = 15)
plt.xticks([])
plt.yticks([])
plt.subplot(1,2,2)
plt.imshow(x_reconst.reshape(28,28), 'gray')
plt.title('Reconstructed Image', fontsize = 15)
plt.xticks([])
plt.yticks([])
plt.show()
# Project 500 test images into the 2-D latent space, colored by digit class.
test_x, test_y = test_batch_maker(500)
test_y = np.argmax(test_y, axis = 1)
test_latent = sess.run(latent, feed_dict = {x: test_x})
plt.figure(figsize = (10,10))
plt.scatter(test_latent[test_y == 1,0], test_latent[test_y == 1,1], label = '1')
plt.scatter(test_latent[test_y == 5,0], test_latent[test_y == 5,1], label = '5')
plt.scatter(test_latent[test_y == 6,0], test_latent[test_y == 6,1], label = '6')
plt.title('Latent Space', fontsize=15)
plt.xlabel('Z1', fontsize=15)
plt.ylabel('Z2', fontsize=15)
plt.legend(fontsize = 15)
plt.axis('equal')
plt.show()
Data Generation
It generates something that makes sense.
These results are unsatisfying, because the density model used on the latent space $\mathcal{F}$ is too simple and inadequate.
Building a āgoodā model amounts to our original problem of modeling an empirical distribution, although it may now be in a lower dimension space.
This is a motivation to VAE or GAN.
# Pick an arbitrary point in the latent space and decode it into a
# generated ("fake") image.
new_data = np.array([[-4, 0]])
latent_input = tf.placeholder(tf.float32, [None, n_latent])
reconst = decoder(latent_input, weights, biases)
fake_image = sess.run(reconst, feed_dict = {latent_input: new_data})
plt.figure(figsize=(16,7))
plt.subplot(1,2,1)
plt.scatter(test_latent[test_y == 1,0], test_latent[test_y == 1,1], label = '1')
plt.scatter(test_latent[test_y == 5,0], test_latent[test_y == 5,1], label = '5')
plt.scatter(test_latent[test_y == 6,0], test_latent[test_y == 6,1], label = '6')
plt.scatter(new_data[:,0], new_data[:,1], c = 'k', marker = 'o', s = 200, label = 'new data')
plt.title('Latent Space', fontsize = 15)
plt.xlabel('Z1', fontsize = 15)
plt.ylabel('Z2', fontsize = 15)
plt.legend(loc = 2, fontsize = 12)
plt.axis('equal')
plt.subplot(1,2,2)
plt.imshow(fake_image.reshape(28,28), 'gray')
plt.title('Generated Fake Image', fontsize = 15)
plt.xticks([])
plt.yticks([])
plt.show()
Image Generation
# Initialize canvas: ny rows by nx columns of 28x28 tiles, one decoded
# image per point on a grid over the latent space.
nx = 20
ny = 20
x_values = np.linspace(-8, 4, nx)   # latent z1 grid
y_values = np.linspace(-4, 6, ny)   # latent z2 grid
canvas = np.empty((28*ny, 28*nx))

# Define placeholder for latent inputs and rebuild the decoder on it.
latent_input = tf.placeholder(tf.float32, [None, n_latent])
reconst = decoder(latent_input, weights, biases)

for i, yi in enumerate(y_values):
    for j, xi in enumerate(x_values):
        latent_ = np.array([[xi, yi]])
        reconst_ = sess.run(reconst, feed_dict = {latent_input: latent_})
        # Row index uses ny (i enumerates y_values); the original used nx,
        # which only worked because nx happened to equal ny.
        canvas[(ny-i-1)*28:(ny-i)*28, j*28:(j+1)*28] = reconst_.reshape(28, 28)
# Latent scatter next to the decoded manifold over the same region.
plt.figure(figsize = (16, 7))
plt.subplot(1,2,1)
plt.scatter(test_latent[test_y == 1,0], test_latent[test_y == 1,1], label = '1')
plt.scatter(test_latent[test_y == 5,0], test_latent[test_y == 5,1], label = '5')
plt.scatter(test_latent[test_y == 6,0], test_latent[test_y == 6,1], label = '6')
plt.title('Latent Space', fontsize = 15)
plt.xlabel('Z1', fontsize = 15)
plt.ylabel('Z2', fontsize = 15)
plt.legend(fontsize = 12)
plt.axis('equal')
plt.subplot(1,2,2)
plt.imshow(canvas, 'gray')
plt.title('Manifold', fontsize = 15)
plt.xlabel('Z1', fontsize = 15)
plt.ylabel('Z2', fontsize = 15)
plt.xticks([])
plt.yticks([])
plt.show()
To get an intuition of the latent representation, we can pick two samples $x$ and $x'$ at random and interpolate samples along the line in the latent space
$$g((1-\alpha)f(x) + \alpha f(x'))$$

%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')